To what degree is it possible to predict the number of runs a baseball team will score in a game using known statistical data available before the start of the game?
A final model containing all features. The training set is composed of game-by-game WHIP, DER and OPS; the test set uses season-to-date average WHIP, DER and OPS.
### load packages
# install.packages("FNN")
# install.packages("rpart")
# install.packages("rpart.plot")
# install.packages("rattle")
# install.packages("ggplot2")
# install.packages("knitr")
# install.packages("kableExtra")
# install.packages("gridExtra")
# install.packages("tidyverse")
# install.packages("dplyr")
# install.packages("Boruta")
# install.packages("Metrics")
### set working directory
# NOTE(review): hard-coded absolute path is machine-specific; when running
# interactively in RStudio prefer the rstudioapi line below.
# setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
setwd("/Users/geoffcooper/Dropbox/CKME136/ckme136/FinalResults_20200727")
wd <- getwd()
### open tor_games raw data
# file.path() builds the path portably (replaces paste(wd, "/...", sep = ""))
datafile <- file.path(wd, "tor_games_20200727.csv")
tor_games <- read.csv(datafile)
head(tor_games)
### build the training set: game-by-game values for each feature
# Player OPS columns follow the pattern <retrosheet_id>_ops_game in the raw
# data and are shortened to <retrosheet_id>_ops for modelling.
train_player_ids <- c("drurb001", "bichb001", "biggc002", "davij007",
                      "fishd001", "galvf001", "gricr001", "guerv002",
                      "hernt002", "gurrl001", "jansd001", "maill001",
                      "mcgur002", "mckib001", "pillk001", "smoaj001",
                      "sogae001", "tellr001")
train_src_cols <- c("tor_runs", "game_park_factor", "tor_home",
                    "opp_pitcher_whip_game", "opp_team_der_game",
                    paste0(train_player_ids, "_ops_game"))
tor_games_training <- tor_games[, train_src_cols]
names(tor_games_training) <- c("tor_runs", "game_park_factor", "tor_home",
                               "opp_pitcher_whip", "opp_team_der",
                               paste0(train_player_ids, "_ops"))
head(tor_games_training)
library(dplyr)
### build the test set: season-to-date averages for each feature
test_player_ids <- c("drurb001", "bichb001", "biggc002", "davij007",
                     "fishd001", "galvf001", "gricr001", "guerv002",
                     "hernt002", "gurrl001", "jansd001", "maill001",
                     "mcgur002", "mckib001", "pillk001", "smoaj001",
                     "sogae001", "tellr001")
test_src_cols <- c("game_date", "tor_runs", "game_park_factor", "tor_home",
                   "opp_pitcher_whip_todate", "opp_team_der_todate",
                   paste0(test_player_ids, "_ops_todate"))
tor_games_test <- tor_games[, test_src_cols]
names(tor_games_test) <- c("game_date", "tor_runs", "game_park_factor",
                           "tor_home", "opp_pitcher_whip", "opp_team_der",
                           paste0(test_player_ids, "_ops"))
# Drop the earliest games (presumably because to-date averages are not yet
# meaningful there — TODO confirm cutoff), then drop the date key.
tor_games_test <- tor_games_test %>%
  filter(game_date > 20190409) %>%
  select(-game_date)
head(tor_games_test)
library(ggplot2)
### histogram of the response variable: Blue Jays runs per game, 2019 season,
### with the season mean marked by a dashed red line
ggplot(tor_games_training, aes(x = tor_runs, y = ..count..)) +
  geom_histogram(binwidth = 1, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  geom_density(alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Toronto Blue Jays runs scored (y) - 2019 season") +
  geom_vline(aes(xintercept = mean(tor_runs)), color = "red",
             linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(tor_runs), label = round(mean(tor_runs), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()
# library() errors loudly if the package is missing; require() only warns
library(gridExtra)
### opposition pitcher WHIP (x3): game-by-game (training) vs to-date (test)
whip_training <- ggplot(tor_games_training, aes(x = opp_pitcher_whip, y = ..count..)) +
  geom_histogram(binwidth = 0.25, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  scale_x_continuous(limits = c(-0.5, 4.5), breaks = seq(0, 4, 0.5)) +
  scale_y_continuous(limits = c(0, 60), breaks = seq(0, 50, 10)) +
  geom_density(aes(y = 0.25 * ..count..), alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Opposition pitcher\nWHIP (x3) - Training set (by game)") +
  geom_vline(aes(xintercept = mean(opp_pitcher_whip)), color = "red",
             linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(opp_pitcher_whip), label = round(mean(opp_pitcher_whip), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()
whip_test <- ggplot(tor_games_test, aes(x = opp_pitcher_whip, y = ..count..)) +
  geom_histogram(binwidth = 0.25, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  scale_x_continuous(limits = c(-0.5, 4.5), breaks = seq(0, 4, 0.5)) +
  scale_y_continuous(limits = c(0, 60), breaks = seq(0, 50, 10)) +
  geom_density(aes(y = 0.25 * ..count..), alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Opposition pitcher\nWHIP (x3) - Test set (average to date)") +
  geom_vline(aes(xintercept = mean(opp_pitcher_whip)), color = "red",
             linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(opp_pitcher_whip), label = round(mean(opp_pitcher_whip), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()
grid.arrange(whip_training, whip_test, ncol = 2)
# library() errors loudly if the package is missing; require() only warns
library(gridExtra)
### opposition team DER (x4): game-by-game (training) vs to-date (test)
# NOTE: binwidths differ deliberately in the original (0.1 vs 0.02) —
# presumably because to-date averages cluster tightly; verify before changing.
der_training <- ggplot(tor_games_training, aes(x = opp_team_der, y = ..count..)) +
  geom_histogram(binwidth = 0.1, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  scale_x_continuous(limits = c(0.4, 1.1), breaks = seq(0.5, 1, 0.1)) +
  scale_y_continuous(limits = c(0, 90), breaks = seq(0, 70, 10)) +
  geom_density(aes(y = 0.1 * ..count..), alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Opposition team DER\n(x4) - Training set (by game)") +
  geom_vline(aes(xintercept = mean(opp_team_der)), color = "red",
             linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(opp_team_der), label = round(mean(opp_team_der), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()
der_test <- ggplot(tor_games_test, aes(x = opp_team_der, y = ..count..)) +
  geom_histogram(binwidth = 0.02, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  scale_x_continuous(limits = c(0.4, 1.1), breaks = seq(0.5, 1, 0.1)) +
  scale_y_continuous(limits = c(0, 90), breaks = seq(0, 70, 10)) +
  geom_density(aes(y = 0.02 * ..count..), alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Opposition team DER\n(x4) - Test set (average to date)") +
  geom_vline(aes(xintercept = mean(opp_team_der)), color = "red",
             linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(opp_team_der), label = round(mean(opp_team_der), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()
grid.arrange(der_training, der_test, ncol = 2)
# library() errors loudly if the package is missing; require() only warns
library(gridExtra)
### Cavan Biggio OPS (x7): game-by-game (training) vs to-date (test)
biggc002_training <- ggplot(tor_games_training, aes(x = biggc002_ops, y = ..count..)) +
  geom_histogram(binwidth = 0.25, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  scale_x_continuous(limits = c(-0.25, 3), breaks = seq(0, 3, 0.5)) +
  scale_y_continuous(limits = c(0, 90), breaks = seq(0, 90, 10)) +
  geom_density(aes(y = 0.25 * ..count..), alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Cavan Biggio OPS\n(x7) - Training set (by game)") +
  geom_vline(aes(xintercept = mean(biggc002_ops)), color = "red",
             linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(biggc002_ops), label = round(mean(biggc002_ops), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()
biggc002_test <- ggplot(tor_games_test, aes(x = biggc002_ops, y = ..count..)) +
  geom_histogram(binwidth = 0.25, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  scale_x_continuous(limits = c(-0.25, 3), breaks = seq(0, 3, 0.5)) +
  scale_y_continuous(limits = c(0, 90), breaks = seq(0, 90, 10)) +
  geom_density(aes(y = 0.25 * ..count..), alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Cavan Biggio OPS\n(x7) - Test set (average to date)") +
  geom_vline(aes(xintercept = mean(biggc002_ops)), color = "red",
             linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(biggc002_ops), label = round(mean(biggc002_ops), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()
grid.arrange(biggc002_training, biggc002_test, ncol = 2)
# library() errors loudly if the package is missing; require() only warns
library(gridExtra)
### Randal Grichuk OPS (x11): game-by-game (training) vs to-date (test)
# NOTE(review): y limits (0, 540) vs breaks up to 160 look inconsistent in the
# original; preserved as-is — confirm whether 540 was intended (vs e.g. 180).
gricr001_training <- ggplot(tor_games_training, aes(x = gricr001_ops, y = ..count..)) +
  geom_histogram(binwidth = 0.25, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  scale_x_continuous(limits = c(-0.25, 3), breaks = seq(0, 3, 0.5)) +
  scale_y_continuous(limits = c(0, 540), breaks = seq(0, 160, 40)) +
  geom_density(aes(y = 0.25 * ..count..), alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Randal Grichuk OPS\n(x11) - Training set (by game)") +
  geom_vline(aes(xintercept = mean(gricr001_ops)), color = "red",
             linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(gricr001_ops), label = round(mean(gricr001_ops), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()
gricr001_test <- ggplot(tor_games_test, aes(x = gricr001_ops, y = ..count..)) +
  geom_histogram(binwidth = 0.25, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  scale_x_continuous(limits = c(-0.25, 3), breaks = seq(0, 3, 0.5)) +
  scale_y_continuous(limits = c(0, 540), breaks = seq(0, 160, 40)) +
  geom_density(aes(y = 0.25 * ..count..), alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Randal Grichuk OPS\n(x11) - Test set (average to date)") +
  geom_vline(aes(xintercept = mean(gricr001_ops)), color = "red",
             linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(gricr001_ops), label = round(mean(gricr001_ops), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()
grid.arrange(gricr001_training, gricr001_test, ncol = 2)
# library() errors loudly if the package is missing; require() only warns
library(gridExtra)
### Vladimir Guerrero Jr OPS (x12): game-by-game (training) vs to-date (test)
guerv002_training <- ggplot(tor_games_training, aes(x = guerv002_ops, y = ..count..)) +
  geom_histogram(binwidth = 0.25, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  scale_x_continuous(limits = c(-0.25, 3), breaks = seq(0, 3, 0.5)) +
  scale_y_continuous(limits = c(0, 130), breaks = seq(0, 120, 20)) +
  geom_density(aes(y = 0.25 * ..count..), alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Vladimir Guerrero Jr\nOPS (x12) - Training set (by game)") +
  geom_vline(aes(xintercept = mean(guerv002_ops)), color = "red",
             linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(guerv002_ops), label = round(mean(guerv002_ops), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()
guerv002_test <- ggplot(tor_games_test, aes(x = guerv002_ops, y = ..count..)) +
  geom_histogram(binwidth = 0.25, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  scale_x_continuous(limits = c(-0.25, 3), breaks = seq(0, 3, 0.5)) +
  scale_y_continuous(limits = c(0, 130), breaks = seq(0, 120, 20)) +
  geom_density(aes(y = 0.25 * ..count..), alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Vladimir Guerrero Jr\nOPS (x12) - Test set (average to date)") +
  geom_vline(aes(xintercept = mean(guerv002_ops)), color = "red",
             linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(guerv002_ops), label = round(mean(guerv002_ops), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()
grid.arrange(guerv002_training, guerv002_test, ncol = 2)
### empty accumulator frames for model-comparison rows (one per model)
models_compare_training <- data.frame(
  Model = character(),
  Correlation_r = double(),
  Root_mean_squared_error = double(),
  Error_distribution = character()
)
models_compare_test <- data.frame(
  Model = character(),
  Correlation_r = double(),
  Root_mean_squared_error = double(),
  Error_distribution = character()
)
### linear regression on all features (training set, game-by-game values)
model_linreg <- lm(
  tor_runs ~ game_park_factor + tor_home + opp_pitcher_whip + opp_team_der +
    drurb001_ops + bichb001_ops + biggc002_ops + davij007_ops + fishd001_ops +
    galvf001_ops + gricr001_ops + guerv002_ops + hernt002_ops + gurrl001_ops +
    jansd001_ops + maill001_ops + mcgur002_ops + mckib001_ops + pillk001_ops +
    smoaj001_ops + sogae001_ops + tellr001_ops,
  data = tor_games_training
)
summary(model_linreg)
##
## Call:
## lm(formula = tor_runs ~ game_park_factor + tor_home + opp_pitcher_whip +
## opp_team_der + drurb001_ops + bichb001_ops + biggc002_ops +
## davij007_ops + fishd001_ops + galvf001_ops + gricr001_ops +
## guerv002_ops + hernt002_ops + gurrl001_ops + jansd001_ops +
## maill001_ops + mcgur002_ops + mckib001_ops + pillk001_ops +
## smoaj001_ops + sogae001_ops + tellr001_ops, data = tor_games_training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9750 -1.0246 -0.0079 1.0842 6.2977
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.04572 2.19414 -0.477 0.634397
## game_park_factor 1.64124 1.52906 1.073 0.284968
## tor_home 0.06331 0.30299 0.209 0.834797
## opp_pitcher_whip 0.61667 0.24050 2.564 0.011406 *
## opp_team_der -3.55534 1.90159 -1.870 0.063634 .
## drurb001_ops 1.05968 0.26672 3.973 0.000114 ***
## bichb001_ops 0.97259 0.33756 2.881 0.004590 **
## biggc002_ops 0.96562 0.21894 4.411 2.05e-05 ***
## davij007_ops 1.37884 0.56310 2.449 0.015585 *
## fishd001_ops 0.65759 0.45278 1.452 0.148661
## galvf001_ops 0.69456 0.23043 3.014 0.003063 **
## gricr001_ops 0.85121 0.22450 3.792 0.000222 ***
## guerv002_ops 1.04550 0.23294 4.488 1.49e-05 ***
## hernt002_ops 0.97801 0.17225 5.678 7.66e-08 ***
## gurrl001_ops 1.08369 0.24157 4.486 1.51e-05 ***
## jansd001_ops 1.03540 0.26243 3.945 0.000126 ***
## maill001_ops 1.43475 0.48841 2.938 0.003874 **
## mcgur002_ops 0.29216 0.35494 0.823 0.411848
## mckib001_ops 1.15146 0.29050 3.964 0.000118 ***
## pillk001_ops 7.22145 3.56851 2.024 0.044922 *
## smoaj001_ops 0.36627 0.23319 1.571 0.118533
## sogae001_ops 0.89263 0.26014 3.431 0.000791 ***
## tellr001_ops 0.98589 0.23834 4.136 6.08e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.721 on 139 degrees of freedom
## Multiple R-squared: 0.7555, Adjusted R-squared: 0.7168
## F-statistic: 19.53 on 22 and 139 DF, p-value: < 2.2e-16
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.0.2
### evaluate the linear regression on the training data
linreg_predict_training <- predict(model_linreg, tor_games_training)
linreg_actuals_preds_training <- data.frame(cbind(actuals = tor_games_training$tor_runs,
                                                  predicteds = linreg_predict_training))
# NOTE: method = "spearman" computes rank correlation (rho); it is reported
# under the label "r" below.
correlation_accuracy_training <- cor(linreg_actuals_preds_training$actuals,
                                     y = linreg_actuals_preds_training$predicteds,
                                     method = c("spearman"))
# head(linreg_actuals_preds_training)
print("Correlation (r) - Predicted vs Actual")
## [1] "Correlation (r) - Predicted vs Actual"
correlation_accuracy_training
## [1] 0.8729085
linreg_preds_errors <- (linreg_actuals_preds_training$actuals - linreg_actuals_preds_training$predicteds)
linreg_rmse <- rmse(linreg_actuals_preds_training$actuals, linreg_actuals_preds_training$predicteds)
print("Root mean squared error")
## [1] "Root mean squared error"
linreg_rmse
## [1] 1.594344
print("Error distribution")
## [1] "Error distribution"
# Shapiro-Wilk: p > 0.05 => residuals consistent with a normal distribution
sw <- shapiro.test(linreg_preds_errors)
linreg_preds_errors_dist <- if (sw$p.value > 0.05) "Normal" else "Not normal"
linreg_preds_errors_dist
## [1] "Normal"
models_compare_training <- rbind(models_compare_training,
                                 data.frame("Model" = "Linear regression",
                                            "Correlation_r" = correlation_accuracy_training,
                                            "Root_mean_squared_error" = linreg_rmse,
                                            "Error_distribution" = linreg_preds_errors_dist))
models_compare_test <- rbind(models_compare_test,
                             data.frame("Model" = "Linear regression (Training)",
                                        "Correlation_r" = correlation_accuracy_training,
                                        "Root_mean_squared_error" = linreg_rmse,
                                        "Error_distribution" = linreg_preds_errors_dist))
# "ŷ" fixes the original's mojibake "Å·" (UTF-8 bytes misread as Latin-1).
# The fit line now regresses predicted on actual to match the plot axes and
# the k-NN / regression-tree plots (original had the formula inverted).
plot(linreg_actuals_preds_training$actuals, linreg_actuals_preds_training$predicteds,
     main = "Plot of runs/game - Linear regression model - Training data",
     xlab = "Actual (y)", ylab = "Predicted (ŷ)")
abline(lm(linreg_actuals_preds_training$predicteds ~ linreg_actuals_preds_training$actuals), col = "red")
library(ggplot2)
### overlay the actual and predicted run distributions (linear regression)
actuals <- data.frame(Runs = tor_games_training$tor_runs, Dataset = "Actual")
predicteds <- data.frame(Runs = linreg_predict_training, Dataset = "Predicted")
runs <- rbind(actuals, predicteds)
ggplot(runs, aes(x = Runs, y = ..count.., color = Dataset, fill = Dataset)) +
  geom_histogram(binwidth = 1, alpha = 0.5, position = "dodge") +
  geom_density(alpha = 0) +
  scale_fill_manual(values = c("darkblue", "lightblue")) +
  scale_color_manual(values = c("darkblue", "lightblue")) +
  labs(title = "Histogram of Runs/Game by Toronto Blue Jays (2019 Season)\nLinear regression model - Actual vs Predicted",
       x = "Runs", y = "Games") +
  theme_classic()
library(dplyr)
library(FNN)
### k-NN regression (k = 12). With test = NULL, FNN::knn.reg produces
### leave-one-out cross-validated predictions on the training set.
knn_tor_games_actuals <- select(tor_games_training, tor_runs)
knn_tor_games_training <- select(tor_games_training, -tor_runs)
model_knn <- knn.reg(knn_tor_games_training, test = NULL, knn_tor_games_actuals, k = 12)
model_knn
## PRESS = 1125.403
## R2-Predict = NA
library(Metrics)
### evaluate the k-NN model (LOOCV predictions) against actual runs
# method = "spearman" computes rank correlation (rho), reported as "r" below
knn_cor <- cor(knn_tor_games_actuals$tor_runs, y = model_knn$pred, method = c("spearman"))
# head(model_knn$pred)
print("Correlation (r) - Predicted vs Actual")
## [1] "Correlation (r) - Predicted vs Actual"
knn_cor
## [1] 0.7536747
knn_preds_errors <- (knn_tor_games_actuals$tor_runs - model_knn$pred)
knn_rmse <- rmse(knn_tor_games_actuals$tor_runs, model_knn$pred)
print("Root mean squared error")
## [1] "Root mean squared error"
knn_rmse
## [1] 2.635703
print("Error distribution")
## [1] "Error distribution"
# Shapiro-Wilk: p > 0.05 => residuals consistent with a normal distribution
sw <- shapiro.test(knn_preds_errors)
knn_preds_errors_dist <- if (sw$p.value > 0.05) "Normal" else "Not normal"
knn_preds_errors_dist
## [1] "Not normal"
models_compare_training <- rbind(models_compare_training,
                                 data.frame("Model" = "k-NN regression",
                                            "Correlation_r" = knn_cor,
                                            "Root_mean_squared_error" = knn_rmse,
                                            "Error_distribution" = knn_preds_errors_dist))
# "ŷ" fixes the original's mojibake "Å·" (UTF-8 bytes misread as Latin-1)
plot(knn_tor_games_actuals$tor_runs, model_knn$pred,
     main = "Plot of runs/game - k-NN regression model - Training data",
     xlab = "Actual (y)", ylab = "Predicted (ŷ)")
abline(lm(model_knn$pred ~ knn_tor_games_actuals$tor_runs), col = "red")
library(ggplot2)
### overlay the actual and predicted run distributions (k-NN regression)
actuals <- data.frame(Runs = knn_tor_games_actuals$tor_runs, Dataset = "Actual")
predicteds <- data.frame(Runs = model_knn$pred, Dataset = "Predicted")
runs <- rbind(actuals, predicteds)
ggplot(runs, aes(x = Runs, y = ..count.., color = Dataset, fill = Dataset)) +
  geom_histogram(binwidth = 1, alpha = 0.5, position = "dodge") +
  geom_density(alpha = 0) +
  scale_fill_manual(values = c("darkblue", "lightblue")) +
  scale_color_manual(values = c("darkblue", "lightblue")) +
  labs(title = "Histogram of Runs/Game by Toronto Blue Jays (2019 Season)\nk-NN regression model - Actual vs Predicted",
       x = "Runs", y = "Games") +
  theme_classic()
library(rpart)
### regression tree (ANOVA method) on all features, training set
model_regtree <- rpart(
  tor_runs ~ game_park_factor + tor_home + opp_pitcher_whip + opp_team_der +
    drurb001_ops + bichb001_ops + biggc002_ops + davij007_ops + fishd001_ops +
    galvf001_ops + gricr001_ops + guerv002_ops + hernt002_ops + gurrl001_ops +
    jansd001_ops + maill001_ops + mcgur002_ops + mckib001_ops + pillk001_ops +
    smoaj001_ops + sogae001_ops + tellr001_ops,
  method = "anova",
  data = tor_games_training
)
summary(model_regtree)
## Call:
## rpart(formula = tor_runs ~ game_park_factor + tor_home + opp_pitcher_whip +
## opp_team_der + drurb001_ops + bichb001_ops + biggc002_ops +
## davij007_ops + fishd001_ops + galvf001_ops + gricr001_ops +
## guerv002_ops + hernt002_ops + gurrl001_ops + jansd001_ops +
## maill001_ops + mcgur002_ops + mckib001_ops + pillk001_ops +
## smoaj001_ops + sogae001_ops + tellr001_ops, data = tor_games_training,
## method = "anova")
## n= 162
##
## CP nsplit rel error xerror xstd
## 1 0.35721276 0 1.0000000 1.0104050 0.15005391
## 2 0.10122683 1 0.6427872 0.7489114 0.11961378
## 3 0.07836614 2 0.5415604 0.6990594 0.11651070
## 4 0.03420656 3 0.4631943 0.5989963 0.08902639
## 5 0.02999152 4 0.4289877 0.6554938 0.09522038
## 6 0.02440076 5 0.3989962 0.6842977 0.09706661
## 7 0.02103845 6 0.3745954 0.6471846 0.09468305
## 8 0.01662439 7 0.3535570 0.6162963 0.08675508
## 9 0.01175462 8 0.3369326 0.6146063 0.08646160
## 10 0.01077065 9 0.3251780 0.6178646 0.08633277
## 11 0.01000000 10 0.3144073 0.6170655 0.08632511
##
## Variable importance
## opp_pitcher_whip opp_team_der biggc002_ops guerv002_ops
## 36 22 10 5
## smoaj001_ops gricr001_ops drurb001_ops fishd001_ops
## 4 4 3 3
## mcgur002_ops jansd001_ops galvf001_ops game_park_factor
## 3 3 1 1
## tor_home gurrl001_ops sogae001_ops tellr001_ops
## 1 1 1 1
## bichb001_ops
## 1
##
## Node number 1: 162 observations, complexity param=0.3572128
## mean=4.481481, MSE=10.39781
## left son=2 (113 obs) right son=3 (49 obs)
## Primary splits:
## opp_pitcher_whip < 1.55 to the left, improve=0.3572128, (0 missing)
## opp_team_der < 0.7245 to the right, improve=0.2791761, (0 missing)
## guerv002_ops < 1.05 to the left, improve=0.2106361, (0 missing)
## biggc002_ops < 0.55 to the left, improve=0.1808705, (0 missing)
## hernt002_ops < 0.775 to the left, improve=0.1527696, (0 missing)
## Surrogate splits:
## opp_team_der < 0.587 to the right, agree=0.747, adj=0.163, (0 split)
## drurb001_ops < 1.325 to the left, agree=0.728, adj=0.102, (0 split)
## biggc002_ops < 1.8165 to the left, agree=0.728, adj=0.102, (0 split)
## smoaj001_ops < 1.883 to the left, agree=0.728, adj=0.102, (0 split)
## jansd001_ops < 1.775 to the left, agree=0.722, adj=0.082, (0 split)
##
## Node number 2: 113 observations, complexity param=0.1012268
## mean=3.212389, MSE=5.671705
## left son=4 (71 obs) right son=5 (42 obs)
## Primary splits:
## opp_team_der < 0.7235 to the right, improve=0.2660482, (0 missing)
## biggc002_ops < 1.3335 to the left, improve=0.2056460, (0 missing)
## guerv002_ops < 1.05 to the left, improve=0.1748234, (0 missing)
## opp_pitcher_whip < 1.134 to the left, improve=0.1571339, (0 missing)
## gricr001_ops < 0.7915 to the left, improve=0.1481078, (0 missing)
## Surrogate splits:
## biggc002_ops < 0.775 to the left, agree=0.735, adj=0.286, (0 split)
## opp_pitcher_whip < 1.0625 to the left, agree=0.699, adj=0.190, (0 split)
## guerv002_ops < 1.05 to the left, agree=0.681, adj=0.143, (0 split)
## mcgur002_ops < 0.25 to the left, agree=0.673, adj=0.119, (0 split)
## gricr001_ops < 0.55 to the left, agree=0.664, adj=0.095, (0 split)
##
## Node number 3: 49 observations, complexity param=0.07836614
## mean=7.408163, MSE=9.017076
## left son=6 (42 obs) right son=7 (7 obs)
## Primary splits:
## opp_team_der < 0.558 to the right, improve=0.2987606, (0 missing)
## hernt002_ops < 0.8165 to the left, improve=0.2228298, (0 missing)
## guerv002_ops < 1.0165 to the left, improve=0.2214362, (0 missing)
## gricr001_ops < 1.256 to the left, improve=0.1493609, (0 missing)
## opp_pitcher_whip < 2.9635 to the left, improve=0.1382294, (0 missing)
##
## Node number 4: 71 observations, complexity param=0.02440076
## mean=2.267606, MSE=3.041063
## left son=8 (52 obs) right son=9 (19 obs)
## Primary splits:
## gricr001_ops < 0.7915 to the left, improve=0.1903603, (0 missing)
## opp_pitcher_whip < 0.845 to the left, improve=0.1667133, (0 missing)
## hernt002_ops < 1.125 to the left, improve=0.1464811, (0 missing)
## jansd001_ops < 0.3665 to the left, improve=0.1226668, (0 missing)
## opp_team_der < 0.8975 to the right, improve=0.0766997, (0 missing)
## Surrogate splits:
## opp_pitcher_whip < 1.21 to the left, agree=0.845, adj=0.421, (0 split)
## sogae001_ops < 1.4585 to the left, agree=0.803, adj=0.263, (0 split)
## game_park_factor < 0.9 to the right, agree=0.746, adj=0.053, (0 split)
## bichb001_ops < 0.125 to the left, agree=0.746, adj=0.053, (0 split)
##
## Node number 5: 42 observations, complexity param=0.03420656
## mean=4.809524, MSE=6.058957
## left son=10 (35 obs) right son=11 (7 obs)
## Primary splits:
## biggc002_ops < 1.3335 to the left, improve=0.2264222, (0 missing)
## game_park_factor < 1.047 to the left, improve=0.2253941, (0 missing)
## guerv002_ops < 1.05 to the left, improve=0.2077189, (0 missing)
## opp_team_der < 0.613 to the right, improve=0.1383982, (0 missing)
## galvf001_ops < 1.175 to the left, improve=0.1197605, (0 missing)
##
## Node number 6: 42 observations, complexity param=0.02999152
## mean=6.738095, MSE=5.431406
## left son=12 (7 obs) right son=13 (35 obs)
## Primary splits:
## fishd001_ops < 0.125 to the right, improve=0.2214591, (0 missing)
## guerv002_ops < 1.3 to the left, improve=0.2214591, (0 missing)
## mcgur002_ops < 0.45 to the right, improve=0.1964096, (0 missing)
## bichb001_ops < 0.2 to the right, improve=0.1344972, (0 missing)
## gricr001_ops < 1.256 to the left, improve=0.1237658, (0 missing)
## Surrogate splits:
## mcgur002_ops < 0.45 to the right, agree=0.952, adj=0.714, (0 split)
## bichb001_ops < 0.2 to the right, agree=0.857, adj=0.143, (0 split)
## biggc002_ops < 1.919 to the right, agree=0.857, adj=0.143, (0 split)
##
## Node number 7: 7 observations
## mean=11.42857, MSE=11.67347
##
## Node number 8: 52 observations
## mean=1.807692, MSE=2.039941
##
## Node number 9: 19 observations
## mean=3.526316, MSE=3.617729
##
## Node number 10: 35 observations, complexity param=0.01662439
## mean=4.285714, MSE=4.37551
## left son=20 (25 obs) right son=21 (10 obs)
## Primary splits:
## guerv002_ops < 0.825 to the left, improve=0.18285450, (0 missing)
## gricr001_ops < 1.3415 to the left, improve=0.16791040, (0 missing)
## game_park_factor < 1.0245 to the left, improve=0.15236320, (0 missing)
## sogae001_ops < 0.883 to the left, improve=0.14109140, (0 missing)
## hernt002_ops < 0.9165 to the left, improve=0.09444963, (0 missing)
## Surrogate splits:
## fishd001_ops < 0.125 to the left, agree=0.800, adj=0.3, (0 split)
## game_park_factor < 1.047 to the left, agree=0.771, adj=0.2, (0 split)
## opp_team_der < 0.568 to the right, agree=0.771, adj=0.2, (0 split)
## galvf001_ops < 1.175 to the left, agree=0.771, adj=0.2, (0 split)
## gricr001_ops < 0.125 to the right, agree=0.771, adj=0.2, (0 split)
##
## Node number 11: 7 observations
## mean=7.428571, MSE=6.244898
##
## Node number 12: 7 observations
## mean=4.285714, MSE=0.7755102
##
## Node number 13: 35 observations, complexity param=0.02103845
## mean=7.228571, MSE=4.919184
## left son=26 (15 obs) right son=27 (20 obs)
## Primary splits:
## guerv002_ops < 0.3 to the left, improve=0.20583030, (0 missing)
## gricr001_ops < 0.55 to the left, improve=0.12809200, (0 missing)
## hernt002_ops < 1.1 to the left, improve=0.12391030, (0 missing)
## biggc002_ops < 0.95 to the left, improve=0.11745480, (0 missing)
## smoaj001_ops < 0.225 to the right, improve=0.05837042, (0 missing)
## Surrogate splits:
## smoaj001_ops < 0.625 to the right, agree=0.714, adj=0.333, (0 split)
## tellr001_ops < 0.325 to the right, agree=0.686, adj=0.267, (0 split)
## opp_team_der < 0.712 to the right, agree=0.657, adj=0.200, (0 split)
## galvf001_ops < 0.9 to the right, agree=0.657, adj=0.200, (0 split)
## gricr001_ops < 0.1 to the left, agree=0.657, adj=0.200, (0 split)
##
## Node number 20: 25 observations, complexity param=0.01077065
## mean=3.72, MSE=3.6416
## left son=40 (12 obs) right son=41 (13 obs)
## Primary splits:
## game_park_factor < 1.0245 to the left, improve=0.1992812, (0 missing)
## guerv002_ops < 0.2 to the right, improve=0.1549448, (0 missing)
## tor_home < 0.5 to the left, improve=0.1470140, (0 missing)
## gricr001_ops < 0.7915 to the left, improve=0.1230257, (0 missing)
## biggc002_ops < 0.45 to the right, improve=0.1080146, (0 missing)
## Surrogate splits:
## tor_home < 0.5 to the left, agree=0.92, adj=0.833, (0 split)
## guerv002_ops < 0.45 to the right, agree=0.68, adj=0.333, (0 split)
## biggc002_ops < 0.325 to the right, agree=0.64, adj=0.250, (0 split)
## davij007_ops < 0.4585 to the right, agree=0.64, adj=0.250, (0 split)
## galvf001_ops < 0.125 to the right, agree=0.64, adj=0.250, (0 split)
##
## Node number 21: 10 observations
## mean=5.7, MSE=3.41
##
## Node number 26: 15 observations
## mean=6.066667, MSE=1.262222
##
## Node number 27: 20 observations, complexity param=0.01175462
## mean=8.1, MSE=5.89
## left son=54 (9 obs) right son=55 (11 obs)
## Primary splits:
## opp_pitcher_whip < 2.155 to the right, improve=0.1680815, (0 missing)
## hernt002_ops < 0.9 to the left, improve=0.1613650, (0 missing)
## gricr001_ops < 0.55 to the left, improve=0.1369553, (0 missing)
## mckib001_ops < 0.225 to the left, improve=0.1189162, (0 missing)
## jansd001_ops < 0.575 to the left, improve=0.0994235, (0 missing)
## Surrogate splits:
## gurrl001_ops < 0.8165 to the left, agree=0.80, adj=0.556, (0 split)
## galvf001_ops < 0.775 to the left, agree=0.75, adj=0.444, (0 split)
## hernt002_ops < 0.343 to the left, agree=0.70, adj=0.333, (0 split)
## jansd001_ops < 0.575 to the left, agree=0.70, adj=0.333, (0 split)
## mckib001_ops < 0.525 to the right, agree=0.70, adj=0.333, (0 split)
##
## Node number 40: 12 observations
## mean=2.833333, MSE=1.805556
##
## Node number 41: 13 observations
## mean=4.538462, MSE=3.940828
##
## Node number 54: 9 observations
## mean=7, MSE=5.333333
##
## Node number 55: 11 observations
## mean=9, MSE=4.545455
### visualize the fitted regression tree
library(rpart.plot)
library(rattle)
# rattle's fancyRpartPlot wraps rpart.plot to draw annotated, colored nodes
fancyRpartPlot(model_regtree, main="Regression tree model")
library(Metrics)
### evaluate the regression tree on the training data
regtree_preds <- predict(model_regtree, type = "vector")
# head(regtree_preds)
# method = "spearman" computes rank correlation (rho), reported as "r" below
regtree_cor <- cor(tor_games_training$tor_runs, y = regtree_preds, method = c("spearman"))
print("Correlation (r) - Predicted vs Actual")
## [1] "Correlation (r) - Predicted vs Actual"
regtree_cor
## [1] 0.813598
regtree_preds_errors <- (tor_games_training$tor_runs - regtree_preds)
regtree_rmse <- rmse(tor_games_training$tor_runs, regtree_preds)
print("Root mean squared error")
## [1] "Root mean squared error"
regtree_rmse
## [1] 1.808078
print("Error distribution")
## [1] "Error distribution"
# Shapiro-Wilk: p > 0.05 => residuals consistent with a normal distribution
sw <- shapiro.test(regtree_preds_errors)
regtree_preds_errors_dist <- if (sw$p.value > 0.05) "Normal" else "Not normal"
regtree_preds_errors_dist
## [1] "Not normal"
models_compare_training <- rbind(models_compare_training,
                                 data.frame("Model" = "Regression tree",
                                            "Correlation_r" = regtree_cor,
                                            "Root_mean_squared_error" = regtree_rmse,
                                            "Error_distribution" = regtree_preds_errors_dist))
# jitter() spreads the discrete leaf-mean predictions so overlapping points
# are visible. "ŷ" fixes the original's mojibake "Å·".
plot(tor_games_training$tor_runs, jitter(regtree_preds, 2),
     main = "Plot of runs/game - Regression tree model - Training data",
     xlab = "Actual (y)", ylab = "Predicted (ŷ)")
abline(lm(regtree_preds ~ tor_games_training$tor_runs), col = "red")
library(ggplot2)
### overlay the actual and predicted run distributions (regression tree)
actuals <- data.frame(Runs = tor_games_training$tor_runs, Dataset = "Actual")
predicteds <- data.frame(Runs = regtree_preds, Dataset = "Predicted")
runs <- rbind(actuals, predicteds)
ggplot(runs, aes(x = Runs, y = ..count.., color = Dataset, fill = Dataset)) +
  geom_histogram(binwidth = 1, alpha = 0.5, position = "dodge") +
  geom_density(alpha = 0) +
  scale_fill_manual(values = c("darkblue", "lightblue")) +
  scale_color_manual(values = c("darkblue", "lightblue")) +
  labs(title = "Histogram of Runs/Game by Toronto Blue Jays (2019 Season)\nRegression tree model - Actual vs Predicted",
       x = "Runs", y = "Games") +
  theme_classic()
library(knitr)
library(kableExtra)
### render the training-set model-comparison table
kable_styling(kable(models_compare_training))
| Model | Correlation_r | Root_mean_squared_error | Error_distribution |
|---|---|---|---|
| Linear regression | 0.8729085 | 1.594344 | Normal |
| k-NN regression | 0.7536747 | 2.635703 | Not normal |
| Regression tree | 0.8135980 | 1.808078 | Not normal |
library(Metrics)
### baseline test-set evaluation: all-feature linear model on to-date averages
linreg_predict_test <- predict(model_linreg, tor_games_test)
linreg_actuals_preds_test <- data.frame(cbind(actuals = tor_games_test$tor_runs,
                                              predicteds = linreg_predict_test))
# method = "spearman" computes rank correlation (rho), reported as "r" below
correlation_accuracy_test <- cor(linreg_actuals_preds_test$actuals,
                                 y = linreg_actuals_preds_test$predicteds,
                                 method = c("spearman"))
# head(linreg_actuals_preds_test)
print("Correlation (r) - Predicted vs Actual")
## [1] "Correlation (r) - Predicted vs Actual"
correlation_accuracy_test
## [1] 0.23505
linreg_preds_errors <- (linreg_actuals_preds_test$actuals - linreg_actuals_preds_test$predicteds)
linreg_rmse <- rmse(linreg_actuals_preds_test$actuals, linreg_actuals_preds_test$predicteds)
print("Root mean squared error")
## [1] "Root mean squared error"
linreg_rmse
## [1] 3.1885
print("Error distribution")
## [1] "Error distribution"
# Shapiro-Wilk: p > 0.05 => residuals consistent with a normal distribution
sw <- shapiro.test(linreg_preds_errors)
linreg_preds_errors_dist <- if (sw$p.value > 0.05) "Normal" else "Not normal"
linreg_preds_errors_dist
## [1] "Not normal"
models_compare_test <- rbind(models_compare_test,
                             data.frame("Model" = "Linear regression (Test - baseline)",
                                        "Correlation_r" = correlation_accuracy_test,
                                        "Root_mean_squared_error" = linreg_rmse,
                                        "Error_distribution" = linreg_preds_errors_dist))
# "ŷ" fixes the original's mojibake "Å·" (UTF-8 bytes misread as Latin-1).
# The fit line now regresses predicted on actual to match the plot axes and
# the k-NN / regression-tree plots (original had the formula inverted).
plot(linreg_actuals_preds_test$actuals, linreg_actuals_preds_test$predicteds,
     main = "Plot of runs/game - Linear regression model - Test data - baseline",
     xlab = "Actual (y)", ylab = "Predicted (ŷ)")
abline(lm(linreg_actuals_preds_test$predicteds ~ linreg_actuals_preds_test$actuals), col = "red")
# Build a long-format frame of actual vs predicted runs, then compare the
# two distributions with dodged histograms plus density outlines.
actuals <- data.frame(Runs = tor_games_test$tor_runs, Dataset = "Actual")
predicteds <- data.frame(Runs = linreg_predict_test, Dataset = "Predicted")
runs <- rbind(actuals, predicteds)
ggplot(runs, aes(x = Runs, y = ..count.., color = Dataset, fill = Dataset)) +
  geom_histogram(binwidth = 1, alpha = 0.5, position = "dodge") +
  geom_density(alpha = 0) +
  scale_fill_manual(values = c("darkblue", "lightblue")) +
  scale_color_manual(values = c("darkblue", "lightblue")) +
  labs(title = "Histogram of Runs/Game by Toronto Blue Jays (2019 Season)\nActual vs Predicted - Test Set - baseline", x = "Runs", y = "Games") +
  theme_classic()
### Boruta feature selection: decide which predictors carry real signal
library(Boruta)
# Boruta repeatedly compares each attribute's importance against shadow
# (permuted) copies; maxRuns caps the iterations, doTrace=0 silences progress.
# NOTE(review): no set.seed() is visible before this call, so borderline
# attributes may be classified differently on rerun — confirm if the
# confirmed/unimportant split below needs to be reproducible.
boruta_output <- Boruta(tor_runs~., data=tor_games_training, doTrace=0, maxRuns=500)
boruta_output
## Boruta performed 312 iterations in 9.817127 secs.
## 12 attributes confirmed important: biggc002_ops, drurb001_ops,
## game_park_factor, gricr001_ops, guerv002_ops and 7 more;
## 10 attributes confirmed unimportant: bichb001_ops, davij007_ops,
## fishd001_ops, galvf001_ops, maill001_ops and 5 more;
# Box plots of attribute importance vs the shadow attributes
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")
# Per-attribute importance statistics and final decision (Confirmed/Rejected)
boruta_table <- attStats(boruta_output)
boruta_table
### remove outlier, one game where runs > 13
# NOTE(review): filter(tor_runs < 13) also drops any game with exactly 13
# runs, not only those above 13 — confirm against the data that this cutoff
# removes just the single intended outlier game.
tor_games_training <- tor_games_training %>% filter(tor_runs < 13)
tor_games_test <- tor_games_test %>% filter(tor_runs < 13)
# Final linear model restricted to the predictors Boruta confirmed important.
# 'data' is now passed by name rather than positionally, matching standard
# lm() usage and the recorded Call in the summary output below.
model_linreg_final <- lm(tor_runs~opp_pitcher_whip+opp_team_der+biggc002_ops+gricr001_ops+guerv002_ops+hernt002_ops+gurrl001_ops+jansd001_ops, data = tor_games_training)
summary(model_linreg_final)
##
## Call:
## lm(formula = tor_runs ~ opp_pitcher_whip + opp_team_der + biggc002_ops +
## gricr001_ops + guerv002_ops + hernt002_ops + gurrl001_ops +
## jansd001_ops, data = tor_games_training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3417 -1.2306 -0.4019 1.2832 5.3150
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.9452 1.5713 3.784 0.000222 ***
## opp_pitcher_whip 1.2809 0.2283 5.611 9.28e-08 ***
## opp_team_der -7.0450 1.8123 -3.887 0.000151 ***
## biggc002_ops 0.6065 0.2167 2.799 0.005789 **
## gricr001_ops 0.5769 0.2291 2.518 0.012837 *
## guerv002_ops 0.6686 0.2367 2.825 0.005368 **
## hernt002_ops 0.6735 0.1789 3.764 0.000238 ***
## gurrl001_ops 0.8806 0.2351 3.746 0.000254 ***
## jansd001_ops 0.5427 0.2644 2.052 0.041872 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.887 on 152 degrees of freedom
## Multiple R-squared: 0.6325, Adjusted R-squared: 0.6132
## F-statistic: 32.7 on 8 and 152 DF, p-value: < 2.2e-16
### Evaluate the final (Boruta-reduced) linear regression model on the test set
library(Metrics)
# Predict runs/game on the test set with the final model
linreg_predict_final <- predict(model_linreg_final, tor_games_test)
# Pair actual and predicted runs. Plain data.frame() replaces the original
# data.frame(cbind(...)), which built an unnecessary intermediate matrix.
linreg_actuals_preds_test <- data.frame(actuals = tor_games_test$tor_runs, predicteds = linreg_predict_final)
# Spearman rank correlation between actual and predicted runs
# (method as a scalar string, not c("spearman"))
correlation_accuracy_test <- cor(linreg_actuals_preds_test$actuals, linreg_actuals_preds_test$predicteds, method = "spearman")
# head(linreg_actuals_preds_test)
print("Correlation (r) - Predicted vs Actual")
## [1] "Correlation (r) - Predicted vs Actual"
correlation_accuracy_test
## [1] 0.2550327
# Prediction errors (actual - predicted) for the distribution check below
linreg_preds_errors <- (linreg_actuals_preds_test$actuals - linreg_actuals_preds_test$predicteds)
# Use <- for assignment, consistent with the rest of the script
linreg_rmse <- rmse(linreg_actuals_preds_test$actuals, linreg_actuals_preds_test$predicteds)
print("Root mean squared error")
## [1] "Root mean squared error"
linreg_rmse
## [1] 2.943055
print("Error distribution")
## [1] "Error distribution"
# Shapiro-Wilk test: p > 0.05 means we cannot reject normality of the errors
sw <- shapiro.test(linreg_preds_errors)
linreg_preds_errors_dist <- if (sw$p.value > 0.05) "Normal" else "Not normal"
linreg_preds_errors_dist
## [1] "Not normal"
# Append the final model's test-set metrics to the comparison table
models_compare_test <- rbind(models_compare_test, data.frame("Model"="Linear regression (Test - final)","Correlation_r"=correlation_accuracy_test,"Root_mean_squared_error"=linreg_rmse,"Error_distribution"=linreg_preds_errors_dist))
# Scatter plot of actual (x) vs predicted (y) runs for the final model.
# Two fixes relative to the original:
#   1. the y-axis label contained mojibake "Å·" — a UTF-8 byte sequence for
#      "ŷ" (y-hat) misread as Latin-1; restored to "ŷ".
#   2. the trend line was lm(actuals ~ predicteds), i.e. response and
#      regressor swapped relative to the plot axes. abline() draws in the
#      plot's coordinate system (x = actual, y = predicted), so the fit must
#      be predicteds ~ actuals.
plot(linreg_actuals_preds_test$actuals, linreg_actuals_preds_test$predicteds, main="Plot of runs/game - Linear regression model - Test data - final", xlab="Actual (y)", ylab="Predicted (ŷ)")
abline(lm(linreg_actuals_preds_test$predicteds~linreg_actuals_preds_test$actuals), col="red")
# Build a long-format frame of actual vs predicted runs for the final model,
# then compare the two distributions with dodged histograms plus density
# outlines.
actuals <- data.frame(Runs = tor_games_test$tor_runs, Dataset = "Actual")
predicteds <- data.frame(Runs = linreg_predict_final, Dataset = "Predicted")
runs <- rbind(actuals, predicteds)
ggplot(runs, aes(x = Runs, y = ..count.., color = Dataset, fill = Dataset)) +
  geom_histogram(binwidth = 1, alpha = 0.5, position = "dodge") +
  geom_density(alpha = 0) +
  scale_fill_manual(values = c("darkblue", "lightblue")) +
  scale_color_manual(values = c("darkblue", "lightblue")) +
  labs(title = "Histogram of Runs/Game by Toronto Blue Jays (2019 Season)\nActual vs Predicted - Test Set - final", x = "Runs", y = "Games") +
  theme_classic()
# Render the test-set model comparison as a styled HTML table.
library(knitr)
library(kableExtra)
kable_styling(kable(models_compare_test))
| Model | Correlation_r | Root_mean_squared_error | Error_distribution |
|---|---|---|---|
| Linear regression (Training) | 0.8729085 | 1.594344 | Normal |
| Linear regression (Test - baseline) | 0.2350500 | 3.188500 | Not normal |
| Linear regression (Test - final) | 0.2550327 | 2.943055 | Not normal |